package com.hrhx.springboot.crawler;

import cn.hutool.core.text.csv.CsvUtil;
import cn.hutool.core.text.csv.CsvWriter;
import cn.hutool.core.util.CharsetUtil;
import com.alibaba.fastjson.JSON;
import com.hrhx.springboot.domain.StockInfo;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.concurrent.TimeUnit;
import java.util.regex.Matcher;
import java.util.regex.Pattern;


/**
 * Crawls stock listings from the Sina Finance quote API, enriches every stock with
 * details scraped from its company-profile page (IPO date, company address and a
 * province derived from that address) and writes the collected records to a CSV file.
 *
 * <p>Requests are issued sequentially with a one-second pause between them.
 *
 * @author duhongming
 * @version 1.0
 * @date 2020-03-10 21:07
 */
public class StockCrawler {

    public static final String STOCK_API = "http://vip.stock.finance.sina.com.cn/quotes_service/api/json_v2.php/Market_Center.getHQNodeData?page=${page}&num=80&sort=changepercent&asc=0&node=hs_a&symbol=&_s_r_a=page";
    public static final String COMPANY_API = "http://vip.stock.finance.sina.com.cn/corp/go.php/vCI_CorpInfo/stockid/${stockid}.phtml";

    /** Matches the run of Chinese characters immediately preceding the character "省" (province). */
    private static final Pattern PROVINCE_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]+(?=省)");
    /** Matches the run of Chinese characters immediately preceding the character "市" (city). */
    private static final Pattern CITY_PATTERN = Pattern.compile("[\\u4e00-\\u9fa5]+(?=市)");

    /** Output file used by the no-argument {@link #handleData()}. */
    private static final String DEFAULT_OUTPUT_PATH = "/Users/admin/JavaProject/stock10.csv";
    /** Number of listing pages crawled by the no-argument {@link #handleData()}. */
    private static final int DEFAULT_PAGE_COUNT = 10;
    /** Number of leading address characters used when neither a province nor a city is found. */
    private static final int FALLBACK_PREFIX_LENGTH = 2;

    /**
     * Command-line entry point; runs the crawl with the default output path and page count.
     *
     * @param args ignored
     * @throws IOException          if a page cannot be fetched
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public static void main(String[] args) throws IOException, InterruptedException {
        handleData();
    }

    /**
     * Crawls the default number of listing pages and writes the result to the default CSV file.
     *
     * @throws IOException          if a page cannot be fetched
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public static void handleData() throws IOException, InterruptedException {
        handleData(DEFAULT_OUTPUT_PATH, DEFAULT_PAGE_COUNT);
    }

    /**
     * Crawls listing pages {@code 1..pageCount}, enriches every stock with company details
     * and writes all collected records to {@code outputPath} as UTF-8 CSV.
     *
     * <p>Crawling stops early at the first page that yields no stocks; whatever was collected
     * up to that point is still written.
     *
     * @param outputPath path of the CSV file to write
     * @param pageCount  number of listing pages to request, starting at page 1
     * @throws IOException          if a page cannot be fetched
     * @throws InterruptedException if the thread is interrupted while pausing between requests
     */
    public static void handleData(String outputPath, int pageCount) throws IOException, InterruptedException {
        // Specify the path and encoding. try-with-resources guarantees the writer is closed
        // even when a request fails halfway through the crawl.
        try (CsvWriter writer = CsvUtil.getWriter(outputPath, CharsetUtil.CHARSET_UTF_8)) {
            List<StockInfo> data = new ArrayList<>();

            for (int page = 1; page <= pageCount; page++) {
                List<StockInfo> stockInfoList = fetchStockPage(page);
                // parseArray yields null for a blank or literal "null" response body;
                // treat that (or an empty array) as "no more data" instead of failing with an NPE.
                if (stockInfoList == null || stockInfoList.isEmpty()) {
                    break;
                }
                TimeUnit.SECONDS.sleep(1);
                for (StockInfo stockInfo : stockInfoList) {
                    // Throttle: one company-page request per second.
                    TimeUnit.SECONDS.sleep(1);
                    enrichWithCompanyInfo(stockInfo);
                    System.out.println("爬取第" + page + "页");
                }
                data.addAll(stockInfoList);
            }
            writer.write(data);
        }
    }

    /** Fetches one listing page and parses its body text as a JSON array of {@link StockInfo}. */
    private static List<StockInfo> fetchStockPage(int page) throws IOException {
        Document stockDoc = Jsoup.connect(STOCK_API.replace("${page}", String.valueOf(page))).get();
        return JSON.parseArray(stockDoc.text(), StockInfo.class);
    }

    /** Loads the company-profile page of a stock and copies IPO date, address and province onto it. */
    private static void enrichWithCompanyInfo(StockInfo stockInfo) throws IOException {
        Document companyDoc = Jsoup.connect(COMPANY_API.replace("${stockid}", stockInfo.getCode())).get();

        String ipoDate = companyDoc.select("#comInfo1 > tbody > tr:nth-child(3) > td:nth-child(4) > a").text();
        stockInfo.setIpoDate(ipoDate);

        String companyAddress = companyDoc.select("#comInfo1 > tbody > tr:nth-child(18) > td.ccl").text();
        stockInfo.setCompanyAddress(companyAddress);
        stockInfo.setCompanyProvince(extractProvince(companyAddress));
    }

    /**
     * Derives a province label from an address: the text before "省" if present, otherwise the
     * text before "市", otherwise the first two characters (or fewer when the address is shorter).
     */
    private static String extractProvince(String address) {
        Matcher provinceMatcher = PROVINCE_PATTERN.matcher(address);
        if (provinceMatcher.find()) {
            return provinceMatcher.group();
        }
        Matcher cityMatcher = CITY_PATTERN.matcher(address);
        if (cityMatcher.find()) {
            return cityMatcher.group();
        }
        // Clamp the prefix length: the selector may match nothing, giving an empty address,
        // and an unconditional substring(0, 2) would throw StringIndexOutOfBoundsException.
        return address.substring(0, Math.min(FALLBACK_PREFIX_LENGTH, address.length()));
    }
}

